/**
 * 2017年5月19日
 */
package cn.edu.bjtu.workbench.datasource.fileiter;

import java.io.File;
import java.io.IOException;
import java.util.List;

import javax.management.RuntimeErrorException;

import org.datavec.api.records.reader.impl.regex.RegexLineRecordReader;
import org.datavec.api.split.FileSplit;
import org.datavec.api.split.InputSplit;
import org.datavec.api.writable.Text;
import org.datavec.api.writable.Writable;


/**
 * Regex-based line record reader for "transformed vector with id" files.
 *
 * <p>Sample of the line format this reader targets (fields are separated by a
 * single whitespace character, a tab in the sample; the vector components are
 * separated by spaces):
 * <pre>
 * $2&lt;TAB&gt;1,http://mp.weixin.qq.com/s?__biz=MjM5ODI0MjEwMg==&amp;mid=400516196&amp;idx=3&amp;sn=b48e07fa5348786748754fde150a06f8&lt;TAB&gt;42.61209459928796 27.143187310546637
 * </pre>
 *
 * <p>With the default regex, capture group 1 is the first class label (the
 * digits following the optional {@code $}) and capture group 2 is the vector
 * text (everything after the second whitespace separator). The id/URL field in
 * the middle is matched but not captured.
 *
 * @author Alex
 */
public class TransformedVecWithIdLineVectorRecordReader extends RegexLineRecordReader {

	private static final long serialVersionUID = -8800370644283003597L;

	/**
	 * Default line pattern: optional {@code $}, first label (captured), any
	 * further comma-separated numeric labels (not captured), one whitespace,
	 * a lazily matched middle field (not captured), one whitespace, and the
	 * remainder of the line (captured).
	 */
	private static final String DEFAULT_REGEX = "\\$?(\\d{1,})(?:,\\d{1,})*\\s(?:.*?)\\s(.*)";

	/** File read by {@link #main(String[])} when no path argument is supplied. */
	private static final String DEFAULT_DEMO_FILE = "D:\\textdata\\transformdoc\\20170517092837";

	/** Number of space-separated vector components {@link #main(String[])} expects per record. */
	private static final int EXPECTED_VECTOR_LENGTH = 100;

	/**
	 * Creates a reader that uses the default regex, which captures only the
	 * first class label and the vector.
	 *
	 * <p>To capture additional parts of the line, pass a custom pattern to
	 * {@link #TransformedVecWithIdLineVectorRecordReader(String, int)} in which
	 * the corresponding non-capturing groups ({@code (?:...)}) are turned into
	 * capturing groups; see the {@link java.util.regex.Pattern} documentation.
	 *
	 * @param skipNumLines value forwarded to the superclass constructor as its
	 *                     {@code skipNumLines} argument
	 */
	public TransformedVecWithIdLineVectorRecordReader(int skipNumLines) {
		this(DEFAULT_REGEX, skipNumLines);
	}

	/**
	 * Creates a reader that uses the default regex and a {@code skipNumLines} of 0.
	 */
	public TransformedVecWithIdLineVectorRecordReader() {
		this(0);
	}

	/**
	 * Pass-through constructor that forwards both arguments to
	 * {@link RegexLineRecordReader}.
	 *
	 * @param regex        regular expression applied to each line
	 * @param skipNumLines value forwarded to the superclass constructor as its
	 *                     {@code skipNumLines} argument
	 */
	public TransformedVecWithIdLineVectorRecordReader(String regex, int skipNumLines) {
		super(regex, skipNumLines);
	}

	/**
	 * Manual smoke check: reads every record of a file, prints each label and
	 * fails on the first record whose vector does not split into
	 * {@value #EXPECTED_VECTOR_LENGTH} space-separated components.
	 *
	 * @param args optional; {@code args[0]} is the path of the file to read.
	 *             When absent, the built-in default path is used.
	 * @throws IOException          declared for reader initialization and closing
	 * @throws InterruptedException declared for reader initialization
	 * @throws RuntimeException     if a record's vector has an unexpected
	 *                              number of components
	 */
	public static void main(String[] args) throws IOException, InterruptedException {
		// Allow the input file to be overridden from the command line while
		// keeping the original hard-coded location as the fallback.
		String file = (args != null && args.length > 0) ? args[0] : DEFAULT_DEMO_FILE;

		try (TransformedVecWithIdLineVectorRecordReader rr = new TransformedVecWithIdLineVectorRecordReader(0)) {
			InputSplit is = new FileSplit(new File(file));
			rr.initialize(is);
			while (rr.hasNext()) {
				List<Writable> lw = rr.next();
				// Index 0 holds regex group 1 (label), index 1 holds group 2 (vector).
				Text label = (Text) lw.get(0);
				Text vector = (Text) lw.get(1);
				System.out.println(label);

				// Split once and reuse the result for both the check and the report.
				int componentCount = vector.toString().split(" ").length;
				if (componentCount != EXPECTED_VECTOR_LENGTH) {
					System.out.println(componentCount);
					throw new RuntimeException("Unexpected vector length for label " + label
							+ " in " + file + ": expected " + EXPECTED_VECTOR_LENGTH
							+ " components but found " + componentCount);
				}
			}
		}
	}
}
